/**
 * @file flip_avx.c
 *
 * This module deals with flipping discs.
 *
 * A function is provided for each square of the board. 
 * For LSB to MSB direction, carry propagation can be used to determine
 * contiguous opponent discs.
 * For MSB to LSB direction, sequential search with parallel prefix
 * is used.
 *
 * @date 1998 - 2014
 * @author Toshihiko Okuhara
 * @version 4.4
 */

#include <x86intrin.h>
#include "bit.h"

/**
 * Inverted line masks for the four LSB-to-MSB directions, indexed by square.
 *
 * For entry [x], each lane holds the complement of the squares reached from
 * bit x by repeatedly stepping +7, +8, +9 and +1 bit positions (lanes 0..3,
 * the same lane order as shift1 in flip()) until the edge of the board.
 * The masks are stored complemented so that flip() can OR them into the
 * opponent bitboard (making an increment carry along the line only) and
 * strip off-line bits with _mm256_andnot_si256.
 *
 * Entries 64 and 65 are all ones in every lane, i.e. they describe no line
 * at all. Entry 64 is the pass move; entry 65 is presumably a "no move"
 * index -- TODO confirm against the callers.
 */
static const __v4di lmask[66] = {
	{ ~0x0000000000000000ULL, ~0x0101010101010100ULL, ~0x8040201008040200ULL, ~0x00000000000000feULL },
	{ ~0x0000000000000100ULL, ~0x0202020202020200ULL, ~0x0080402010080400ULL, ~0x00000000000000fcULL },
	{ ~0x0000000000010200ULL, ~0x0404040404040400ULL, ~0x0000804020100800ULL, ~0x00000000000000f8ULL },
	{ ~0x0000000001020400ULL, ~0x0808080808080800ULL, ~0x0000008040201000ULL, ~0x00000000000000f0ULL },
	{ ~0x0000000102040800ULL, ~0x1010101010101000ULL, ~0x0000000080402000ULL, ~0x00000000000000e0ULL },
	{ ~0x0000010204081000ULL, ~0x2020202020202000ULL, ~0x0000000000804000ULL, ~0x00000000000000c0ULL },
	{ ~0x0001020408102000ULL, ~0x4040404040404000ULL, ~0x0000000000008000ULL, ~0x0000000000000080ULL },
	{ ~0x0102040810204000ULL, ~0x8080808080808000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0101010101010000ULL, ~0x4020100804020000ULL, ~0x000000000000fe00ULL },
	{ ~0x0000000000010000ULL, ~0x0202020202020000ULL, ~0x8040201008040000ULL, ~0x000000000000fc00ULL },
	{ ~0x0000000001020000ULL, ~0x0404040404040000ULL, ~0x0080402010080000ULL, ~0x000000000000f800ULL },
	{ ~0x0000000102040000ULL, ~0x0808080808080000ULL, ~0x0000804020100000ULL, ~0x000000000000f000ULL },
	{ ~0x0000010204080000ULL, ~0x1010101010100000ULL, ~0x0000008040200000ULL, ~0x000000000000e000ULL },
	{ ~0x0001020408100000ULL, ~0x2020202020200000ULL, ~0x0000000080400000ULL, ~0x000000000000c000ULL },
	{ ~0x0102040810200000ULL, ~0x4040404040400000ULL, ~0x0000000000800000ULL, ~0x0000000000008000ULL },
	{ ~0x0204081020400000ULL, ~0x8080808080800000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0101010101000000ULL, ~0x2010080402000000ULL, ~0x0000000000fe0000ULL },
	{ ~0x0000000001000000ULL, ~0x0202020202000000ULL, ~0x4020100804000000ULL, ~0x0000000000fc0000ULL },
	{ ~0x0000000102000000ULL, ~0x0404040404000000ULL, ~0x8040201008000000ULL, ~0x0000000000f80000ULL },
	{ ~0x0000010204000000ULL, ~0x0808080808000000ULL, ~0x0080402010000000ULL, ~0x0000000000f00000ULL },
	{ ~0x0001020408000000ULL, ~0x1010101010000000ULL, ~0x0000804020000000ULL, ~0x0000000000e00000ULL },
	{ ~0x0102040810000000ULL, ~0x2020202020000000ULL, ~0x0000008040000000ULL, ~0x0000000000c00000ULL },
	{ ~0x0204081020000000ULL, ~0x4040404040000000ULL, ~0x0000000080000000ULL, ~0x0000000000800000ULL },
	{ ~0x0408102040000000ULL, ~0x8080808080000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0101010100000000ULL, ~0x1008040200000000ULL, ~0x00000000fe000000ULL },
	{ ~0x0000000100000000ULL, ~0x0202020200000000ULL, ~0x2010080400000000ULL, ~0x00000000fc000000ULL },
	{ ~0x0000010200000000ULL, ~0x0404040400000000ULL, ~0x4020100800000000ULL, ~0x00000000f8000000ULL },
	{ ~0x0001020400000000ULL, ~0x0808080800000000ULL, ~0x8040201000000000ULL, ~0x00000000f0000000ULL },
	{ ~0x0102040800000000ULL, ~0x1010101000000000ULL, ~0x0080402000000000ULL, ~0x00000000e0000000ULL },
	{ ~0x0204081000000000ULL, ~0x2020202000000000ULL, ~0x0000804000000000ULL, ~0x00000000c0000000ULL },
	{ ~0x0408102000000000ULL, ~0x4040404000000000ULL, ~0x0000008000000000ULL, ~0x0000000080000000ULL },
	{ ~0x0810204000000000ULL, ~0x8080808000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0101010000000000ULL, ~0x0804020000000000ULL, ~0x000000fe00000000ULL },
	{ ~0x0000010000000000ULL, ~0x0202020000000000ULL, ~0x1008040000000000ULL, ~0x000000fc00000000ULL },
	{ ~0x0001020000000000ULL, ~0x0404040000000000ULL, ~0x2010080000000000ULL, ~0x000000f800000000ULL },
	{ ~0x0102040000000000ULL, ~0x0808080000000000ULL, ~0x4020100000000000ULL, ~0x000000f000000000ULL },
	{ ~0x0204080000000000ULL, ~0x1010100000000000ULL, ~0x8040200000000000ULL, ~0x000000e000000000ULL },
	{ ~0x0408100000000000ULL, ~0x2020200000000000ULL, ~0x0080400000000000ULL, ~0x000000c000000000ULL },
	{ ~0x0810200000000000ULL, ~0x4040400000000000ULL, ~0x0000800000000000ULL, ~0x0000008000000000ULL },
	{ ~0x1020400000000000ULL, ~0x8080800000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0101000000000000ULL, ~0x0402000000000000ULL, ~0x0000fe0000000000ULL },
	{ ~0x0001000000000000ULL, ~0x0202000000000000ULL, ~0x0804000000000000ULL, ~0x0000fc0000000000ULL },
	{ ~0x0102000000000000ULL, ~0x0404000000000000ULL, ~0x1008000000000000ULL, ~0x0000f80000000000ULL },
	{ ~0x0204000000000000ULL, ~0x0808000000000000ULL, ~0x2010000000000000ULL, ~0x0000f00000000000ULL },
	{ ~0x0408000000000000ULL, ~0x1010000000000000ULL, ~0x4020000000000000ULL, ~0x0000e00000000000ULL },
	{ ~0x0810000000000000ULL, ~0x2020000000000000ULL, ~0x8040000000000000ULL, ~0x0000c00000000000ULL },
	{ ~0x1020000000000000ULL, ~0x4040000000000000ULL, ~0x0080000000000000ULL, ~0x0000800000000000ULL },
	{ ~0x2040000000000000ULL, ~0x8080000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0100000000000000ULL, ~0x0200000000000000ULL, ~0x00fe000000000000ULL },
	{ ~0x0100000000000000ULL, ~0x0200000000000000ULL, ~0x0400000000000000ULL, ~0x00fc000000000000ULL },
	{ ~0x0200000000000000ULL, ~0x0400000000000000ULL, ~0x0800000000000000ULL, ~0x00f8000000000000ULL },
	{ ~0x0400000000000000ULL, ~0x0800000000000000ULL, ~0x1000000000000000ULL, ~0x00f0000000000000ULL },
	{ ~0x0800000000000000ULL, ~0x1000000000000000ULL, ~0x2000000000000000ULL, ~0x00e0000000000000ULL },
	{ ~0x1000000000000000ULL, ~0x2000000000000000ULL, ~0x4000000000000000ULL, ~0x00c0000000000000ULL },
	{ ~0x2000000000000000ULL, ~0x4000000000000000ULL, ~0x8000000000000000ULL, ~0x0080000000000000ULL },
	{ ~0x4000000000000000ULL, ~0x8000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xfe00000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xfc00000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xf800000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xf000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xe000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0xc000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x8000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL },	// pass
	{ ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL, ~0x0000000000000000ULL }
};

#ifndef __x86_64__
/**
 * 32-bit replacement for the 64-bit-only intrinsic: place x in the low
 * 64 bits of a 128-bit vector and clear the high 64 bits.
 */
static inline __v2di _mm_cvtsi64_si128(const unsigned long long x) {
	// each half goes into the low 32-bit lane of its own zeroed vector
	const __m128i lo = _mm_cvtsi32_si128((int) x);
	const __m128i hi = _mm_cvtsi32_si128((int) (x >> 32));

	// interleave to { lo, hi, 0, 0 }
	return _mm_unpacklo_epi32(lo, hi);
}
#endif

#define	SWAP64	_MM_SHUFFLE(1, 0, 3, 2)	// for _mm_shuffle_epi32: exchange the two 64-bit halves (== 0x4e)

/**
 * Build the per-lane "nothing outflanked" selector.
 *
 * Each 64-bit lane of the result is
 *  -1 (all ones) if the matching lane of outflank is 0,
 *   0 if that lane has at least one bit set.
 */
static inline __v4di flipmask (__v4di outflank) {
	const __m256i zero = _mm256_setzero_si256();

	return _mm256_cmpeq_epi64(outflank, zero);
}

/**
 * Compute flipped discs when playing on square pos.
 *
 * Four directions are processed at once, one per 64-bit lane of a 256-bit
 * vector; lanes 0..3 use the bit strides 7, 8, 9 and 1.
 *  - Toward the LSB (right shifts): the run of opponent discs next to the
 *    move is grown by parallel prefix (up to 1, 2, 4, then 6 squares) and
 *    kept only if the square just past the run holds a player's disc.
 *  - Toward the MSB: adding 1 to (opponent | ~line) makes the carry ripple
 *    through the contiguous opponent discs and stop on the first other
 *    square of the line; if that square is the player's, every line square
 *    below it is flipped.
 *
 * @param pos player's move; used as index into X_TO_BIT and lmask
 *            (lmask has 66 entries, so 0..65).
 * @param P player's disc pattern.
 * @param O opponent's disc pattern.
 * @return flipped disc pattern.
 */
unsigned long long flip(int pos, const unsigned long long P, const unsigned long long O)
{
	// the accumulator is named "flipped" so that it does not shadow this function
	__v4di	PP, mOO, outflank, flipped, shift2, pre, mask;
	__v2di	flip2;
	static const __v4di	minusone = { -1, -1, -1, -1 };
	// bit stride of the direction handled by each lane
	static const __v4di	shift1 = { 7, 8, 9, 1 };
	// strides 7, 9 and 1 drop the first and last bit of every byte so that a
	// run cannot wrap around a row boundary; the stride 8 lane keeps all bits
	static const __v4di	mflipH = { 0x7e7e7e7e7e7e7e7eULL, 0xffffffffffffffffULL, 0x7e7e7e7e7e7e7e7eULL, 0x7e7e7e7e7e7e7e7eULL };

	PP = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(P));
	mOO = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(O)) & mflipH;

	// MSB to LSB directions: parallel prefix starting from X_TO_BIT[pos]
	// (declared elsewhere; presumably the single bit of square pos).
	// The pointer is const-qualified to match _mm_loadl_epi64's parameter.
	flipped  = _mm256_broadcastq_epi64(_mm_loadl_epi64((const __m128i *) &X_TO_BIT[pos]));
	flipped  = mOO & _mm256_srlv_epi64(flipped, shift1);		// 1st opponent disc
	flipped |= mOO & _mm256_srlv_epi64(flipped, shift1);		// up to 2
	pre   = mOO & _mm256_srlv_epi64(mOO, shift1);			// pairs of adjacent opponent discs
	shift2 = shift1 + shift1;
	flipped |= pre & _mm256_srlv_epi64(flipped, shift2);		// up to 4
	flipped |= pre & _mm256_srlv_epi64(flipped, shift2);		// up to 6
	outflank = _mm256_srlv_epi64(flipped, shift1) & PP;		// player's disc closing the run
	flipped = _mm256_andnot_si256(flipmask(outflank), flipped);	// drop lanes without outflank

	// LSB to MSB directions: carry propagation along the line
	mask = lmask[pos];
	// (x - minusone) is x + 1: the carry stops on the first line square
	// that is not an opponent disc
	outflank = _mm256_andnot_si256(mask, ((mOO | mask) - minusone) & PP);
	// subtract 1 only from non-zero lanes: sets every bit below the
	// outflanking disc, then restrict to the line
	flipped |= _mm256_andnot_si256(mask, outflank - (flipmask(outflank) - minusone));

	// OR the four lanes together
	flip2 = _mm256_castsi256_si128(flipped) | _mm256_extracti128_si256(flipped, 1);
	flip2 |= _mm_shuffle_epi32(flip2, SWAP64);

	return flip2[0];
}
